dat %>%
ggplot(aes(year, lakeid, fill=is.na(daynum))) +
geom_tile(color="grey") +
theme_bw() +
facet_wrap(~metric, nrow=6)
Remaining data issues:
Predict DOC daynum from other variable daynum’s in each northern lake
no_dups = dat %>% group_by(lakeid, year, metric) %>% summarise(N = n()) %>% filter(N == 1)
## `summarise()` has grouped output by 'lakeid', 'year'. You can override using
## the `.groups` argument.
n_lakes_wide = left_join(no_dups, dat) %>%
select(-sampledate) %>%
pivot_wider(names_from = metric, values_from = daynum) %>%
filter(lakeid %in% c("AL", "BM", "CB", "CR", "SP", "TB", "TR"))
## Joining, by = c("lakeid", "year", "metric")
hold_data = na.omit(data.frame(n_lakes_wide %>% ungroup() %>% select(-year, -N))) # , -doc_predict
all = lm(doc_epiMax~(iceon+straton+stratoff+energy+
stability+anoxia_summer+
secchi_max)*lakeid, data=hold_data)
i_o = lm(doc_epiMax~1, data=hold_data)
hold_step = step(i_o, scope=formula(all))
## Start: AIC=1760.48
## doc_epiMax ~ 1
##
## Df Sum of Sq RSS AIC
## + lakeid 6 75207 392168 1732.0
## + stratoff 1 12973 454402 1756.0
## <none> 467375 1760.5
## + straton 1 1124 466251 1761.9
## + iceon 1 606 466769 1762.2
## + secchi_max 1 606 466769 1762.2
## + energy 1 436 466939 1762.3
## + stability 1 208 467167 1762.4
## + anoxia_summer 1 179 467196 1762.4
##
## Step: AIC=1731.95
## doc_epiMax ~ lakeid
##
## Df Sum of Sq RSS AIC
## + stratoff 1 3892 388276 1731.7
## <none> 392168 1732.0
## + anoxia_summer 1 3132 389036 1732.1
## + energy 1 2445 389723 1732.5
## + straton 1 530 391638 1733.6
## + iceon 1 510 391658 1733.7
## + stability 1 394 391774 1733.7
## + secchi_max 1 328 391840 1733.8
## - lakeid 6 75207 467375 1760.5
##
## Step: AIC=1731.65
## doc_epiMax ~ lakeid + stratoff
##
## Df Sum of Sq RSS AIC
## <none> 388276 1731.7
## + anoxia_summer 1 3273 385004 1731.7
## + energy 1 3036 385240 1731.8
## - stratoff 1 3892 392168 1732.0
## + straton 1 649 387627 1733.3
## + stability 1 603 387673 1733.3
## + iceon 1 374 387903 1733.4
## + secchi_max 1 247 388029 1733.5
## + stratoff:lakeid 6 6943 381333 1739.5
## - lakeid 6 66125 454402 1756.0
doc_model = lm(doc_epiMax~(iceon+straton+stratoff+energy+
stability+anoxia_summer+
secchi_max)*lakeid,
data=n_lakes_wide,
na.action = na.exclude)
summary(doc_model)
##
## Call:
## lm(formula = doc_epiMax ~ (iceon + straton + stratoff + energy +
## stability + anoxia_summer + secchi_max) * lakeid, data = n_lakes_wide,
## na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -150.321 -17.397 2.832 23.126 119.066
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77.29634 170.93863 0.452 0.6517
## iceon 0.25799 0.36315 0.710 0.4784
## straton 0.22299 0.36107 0.618 0.5376
## stratoff 0.44312 0.31556 1.404 0.1620
## energy -0.20198 0.22429 -0.901 0.3690
## stability -0.14405 0.14826 -0.972 0.3326
## anoxia_summer -0.04084 0.17612 -0.232 0.8169
## secchi_max -0.07579 0.08286 -0.915 0.3616
## lakeid.L 319.71870 449.77873 0.711 0.4781
## lakeid.Q 357.93277 440.74822 0.812 0.4178
## lakeid.C 235.35696 456.57259 0.515 0.6068
## lakeid^4 129.67020 457.05136 0.284 0.7770
## lakeid^5 -416.46790 463.26683 -0.899 0.3699
## lakeid^6 295.39872 445.76920 0.663 0.5084
## iceon:lakeid.L -1.09045 0.96020 -1.136 0.2576
## iceon:lakeid.Q -0.62871 0.94821 -0.663 0.5082
## iceon:lakeid.C -0.47248 0.95510 -0.495 0.6214
## iceon:lakeid^4 0.19702 0.99387 0.198 0.8431
## iceon:lakeid^5 0.44611 0.94996 0.470 0.6392
## iceon:lakeid^6 -0.24304 0.95668 -0.254 0.7997
## straton:lakeid.L -0.99644 0.96684 -1.031 0.3041
## straton:lakeid.Q -0.06648 0.88073 -0.075 0.9399
## straton:lakeid.C 0.21534 0.97934 0.220 0.8262
## straton:lakeid^4 1.02889 1.02504 1.004 0.3168
## straton:lakeid^5 1.52519 0.99168 1.538 0.1258
## straton:lakeid^6 0.39826 0.87856 0.453 0.6509
## stratoff:lakeid.L -0.22517 0.84243 -0.267 0.7896
## stratoff:lakeid.Q -0.27602 0.75215 -0.367 0.7141
## stratoff:lakeid.C 0.17399 0.84559 0.206 0.8372
## stratoff:lakeid^4 -0.52386 0.94017 -0.557 0.5781
## stratoff:lakeid^5 0.87622 0.84874 1.032 0.3033
## stratoff:lakeid^6 -1.22688 0.76657 -1.600 0.1112
## energy:lakeid.L 0.35302 0.59698 0.591 0.5550
## energy:lakeid.Q -0.31656 0.55245 -0.573 0.5674
## energy:lakeid.C -0.19354 0.59750 -0.324 0.7464
## energy:lakeid^4 -0.21917 0.64995 -0.337 0.7364
## energy:lakeid^5 -0.43207 0.59803 -0.722 0.4709
## energy:lakeid^6 0.09195 0.56057 0.164 0.8699
## stability:lakeid.L -0.07321 0.34534 -0.212 0.8324
## stability:lakeid.Q 0.22801 0.37854 0.602 0.5477
## stability:lakeid.C -0.11656 0.39617 -0.294 0.7689
## stability:lakeid^4 -0.45500 0.34578 -1.316 0.1899
## stability:lakeid^5 -0.14418 0.44118 -0.327 0.7442
## stability:lakeid^6 -0.04854 0.43532 -0.112 0.9113
## anoxia_summer:lakeid.L 0.29492 0.47623 0.619 0.5365
## anoxia_summer:lakeid.Q 0.16496 0.46771 0.353 0.7247
## anoxia_summer:lakeid.C -0.07344 0.45345 -0.162 0.8715
## anoxia_summer:lakeid^4 -0.19684 0.50500 -0.390 0.6972
## anoxia_summer:lakeid^5 -0.16585 0.42945 -0.386 0.6998
## anoxia_summer:lakeid^6 0.05669 0.46058 0.123 0.9022
## secchi_max:lakeid.L 0.50747 0.23469 2.162 0.0319 *
## secchi_max:lakeid.Q -0.44115 0.23547 -1.873 0.0626 .
## secchi_max:lakeid.C -0.17325 0.22536 -0.769 0.4430
## secchi_max:lakeid^4 -0.07064 0.19545 -0.361 0.7182
## secchi_max:lakeid^5 -0.14772 0.21562 -0.685 0.4942
## secchi_max:lakeid^6 0.13562 0.20575 0.659 0.5106
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42.9 on 180 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.2926, Adjusted R-squared: 0.07644
## F-statistic: 1.354 on 55 and 180 DF, p-value: 0.072
n_lakes_wide$doc_predict = predict(doc_model, n_lakes_wide)
cor = n_lakes_wide %>%
group_by(lakeid) %>%
summarise(r = paste("r =", round(cor(doc_epiMax, doc_predict, use="complete.obs"), 3)))
n_lakes_wide %>%
ggplot(aes(x=doc_epiMax, y=doc_predict, color=as.character(lakeid))) +
geom_point()+
theme_bw() +
labs(color="lakeid")+
geom_abline(slope=1, intercept = 0) +
facet_wrap(~lakeid) +
geom_text(aes(label=r), data=cor, x=300, y=0) +
labs(x="obs peak DOC (daynum)", y="modeled peak DOC (daynum)")
## Warning: Removed 37 rows containing missing values (`geom_point()`).
# not good but vaguely positive relationship; use this instead of median?
missing_doc = n_lakes_wide %>% filter(is.na(doc_epiMax))
missing_doc$doc_fill = round(predict(doc_model, missing_doc))
doc_fill = missing_doc %>%
select(lakeid, year, doc_fill) %>%
rename(daynum_fill = doc_fill) %>%
mutate(metric = "doc_epiMax") %>%
filter(year < 2000)
all_fill = bind_rows(doc_fill)
dat_filled = full_join(dat, all_fill) %>%
mutate(filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill), TRUE, FALSE)) %>%
mutate(daynum_fill = ifelse(is.na(daynum) & !is.na(daynum_fill), daynum_fill, daynum))
## Joining, by = c("lakeid", "metric", "year")
dat_filled$lakeid = factor(dat_filled$lakeid,
levels = c("AL", "BM", "CB", "CR", "SP", "TB", "TR", "FI", "ME", "MO", "WI"),
ordered = T)
vars = c("iceoff", "straton", "secchi_all", "secchi_max","secchi_min", "zoopDensity","zoopDensity_CC",
"doc_epiMax","totpuf_epiMax", "totpuf_epiMin", "totpuf_hypoMax", "totpuf_hypoMin",
"anoxia", "anoxia_summer", "stability", "energy", "stratoff", "iceon")
dat_filled$metric = factor(dat_filled$metric,
levels = rev(vars),
ordered=T)
dat_filled %>%
ggplot(aes(year, metric, fill=filled_flag)) +
geom_tile(color="grey") +
theme_bw() +
facet_wrap(~lakeid, nrow=6)
## Additional trimming/filling
Trim to “good” years, fill FI ice data with Monona, then fill additional missing with median values for each lake/metric
df_yearsWant = dat_filled %>%
filter((lakeid %in% c("FI", "ME", "MO", "WI") & year %in% 1996:2018) |
(!(lakeid %in% c("FI", "ME", "MO", "WI")) & year %in% 1982:2018))
fi_icefill = df_yearsWant %>%
filter(lakeid == "MO" & metric %in% c("iceoff", "iceon")) %>%
mutate(lakeid = "FI") %>%
select(-daynum, -filled_flag, -sampledate) %>%
rename(icefill = daynum_fill)
df_yearsWant = df_yearsWant %>%
full_join(fi_icefill) %>%
mutate(daynum_fill = ifelse(is.na(daynum_fill) & !is.na(icefill),
icefill, daynum),
filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill) & !is.na(icefill),
TRUE, filled_flag))
## Joining, by = c("lakeid", "metric", "year")
df_yearsWant %>%
ggplot(aes(year, metric, fill=filled_flag)) +
geom_tile(color="grey") +
theme_bw() +
facet_wrap(~lakeid, nrow=6)
all_lys = df_yearsWant %>%
select(lakeid, year) %>%
distinct()
metric = unique(df_yearsWant$metric)
all_lys_want = expand_grid(all_lys, metric)
dat_final = left_join(all_lys_want, df_yearsWant)
## Joining, by = c("lakeid", "year", "metric")
medians = dat_final %>%
group_by(lakeid, metric) %>%
summarise(median_daynum = median(daynum_fill, na.rm=T))
## `summarise()` has grouped output by 'lakeid'. You can override using the
## `.groups` argument.
dat_final = dat_final %>%
left_join(medians) %>%
mutate(daynum_fill = ifelse(is.na(daynum_fill), median_daynum, daynum_fill)) %>%
mutate(filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill), TRUE, filled_flag)) %>%
select(-icefill, -median_daynum)
## Joining, by = c("lakeid", "metric")
dat_final %>%
ggplot(aes(year, metric, fill=filled_flag)) +
geom_tile(color="grey") +
theme_bw() +
facet_wrap(~lakeid, nrow=6)
write_csv(dat_final, "Data/analysis_ready/final_combined_dates_filled_v2.csv")